library(Rwordseg)
library(tm)
library(jiebaR)
library(jiebaRD)
library(dplyr)
library(ggplot2)
library(readr)
library(stringr)
library(coefplot)
library(tidyr)
library(xtable)
library(stargazer)
library(ggpubr)
library(estimatr)

#------------------------------------------------------------------------------------------
## Data Pre-processing and segmentation 
#------------------------------------------------------------------------------------------

# Step 1.1: load the raw campaign files and stack them by campaign type

# --- Mobilization campaigns ---
Mob_India_1962 <- read_csv("mobilization_campaign_India_1962.csv")
Mob_Soviet_1969 <- read_csv("mobilization_campaign_Soviet_1969.csv")
Mob_Vietnam_1974 <- read_csv("mobilization_campaign_Vietnam_1974.csv")
# Only the first seven columns of the 1979 file are kept
# (presumably so it lines up with the other files for rbind -- confirm).
Mob_Vietnam_1979 <- read_csv("mobilization_campaign_Vietnam_1979.csv")[, 1:7]

Mob_Combined <- do.call(
  rbind,
  list(Mob_India_1962, Mob_Soviet_1969, Mob_Vietnam_1974, Mob_Vietnam_1979)
)
# Column 7 is the one converted to text and tokenized further down.
Mob_Ori <- Mob_Combined[7]

# --- Pacification campaigns ---
Pac_Japan_1990 <- read_csv("pacification_campaign_Japan_1990.csv")
Pac_Japan_1996 <- read_csv("pacification_campaign_Japan_1996.csv")
Pac_Japan_2005 <- read_csv("pacification_campaign_Japan_2005.csv")
Pac_Japan_2010 <- read_csv("pacification_campaign_Japan_2010.csv")
Pac_Japan_2012 <- read_csv("pacification_campaign_Japan_2012.csv")
Pac_Philippines_2016 <- read_csv("pacification_campaign_Philippines_2016.csv")
Pac_India_2017 <- read_csv("pacification_campaign_India_2017.csv")

Pac_Combined <- do.call(
  rbind,
  list(
    Pac_Japan_1990, Pac_Japan_1996, Pac_Japan_2005, Pac_Japan_2010,
    Pac_Japan_2012, Pac_Philippines_2016, Pac_India_2017
  )
)
Pac_Ori <- Pac_Combined[7]


# Convert text data into string
# Each row of the single-column tables `Mob_Ori` / `Pac_Ori` becomes one
# element of a plain character vector (`Mob` / `Pac`). The per-row
# `as.String()` conversion is kept, but the vectors are built with
# `vapply()` over `seq_len()` so that:
#   * an empty table yields `character(0)` instead of looping over c(1, 0)
#     and fabricating a bogus "NA" document, and
#   * the result is allocated once rather than grown element by element.
Mob <- vapply(
  seq_len(nrow(Mob_Ori)),
  function(row) as.character(as.String(Mob_Ori[row, 1])),
  character(1)
)
Pac <- vapply(
  seq_len(nrow(Pac_Ori)),
  function(row) as.character(as.String(Pac_Ori[row, 1])),
  character(1)
)


# Step 1.2: word segmentation/ Tokenization
# This step may take long time, can use the output file directly

wk <- worker() # Function for tokenization

# Segment every text once and lay the tokens out one per column.
#
# texts   : character vector, one element per article.
# worker  : jiebaR worker used by segment().
# id_name : name given to the first column, which holds the article index.
#
# Returns a data frame with one row per article: the index column followed by
# character columns V2, V3, ... (the names positional data-frame assignment
# would create), padded with NA where an article has fewer tokens.
# Whole columns are assembled at once instead of assigning cell by cell, and
# an article that yields no tokens simply produces an all-NA row.
tokenize_to_frame <- function(texts, worker, id_name) {
  token_list <- lapply(texts, function(txt) segment(txt, worker))
  max_tokens <- max(lengths(token_list), 0L)

  out <- data.frame(seq_along(texts))
  names(out) <- id_name

  for (k in seq_len(max_tokens)) {
    # tok[k] is NA_character_ when the article has fewer than k tokens
    out[[paste0("V", k + 1L)]] <- vapply(
      token_list,
      function(tok) tok[k],
      character(1)
    )
  }
  out
}

Tokenization_Mob <- tokenize_to_frame(Mob, wk, "1:length(Mob)")
write_excel_csv(Tokenization_Mob,file = "Tokenization_Mob.csv")

Tokenization_Pac <- tokenize_to_frame(Pac, wk, "1:length(Pac)")
write_excel_csv(Tokenization_Pac,file = "Tokenization_Pac.csv")

#------------------------------------------------------------------------------------------
## Method 1: NTU dictionary 
#------------------------------------------------------------------------------------------

# Load the NTU sentiment dictionaries (one word per row, no header).
Positive <- read_csv("NTUSD_positive_simplified.csv",col_names = FALSE)
Negative <- read_csv("NTUSD_negative_simplified.csv",col_names = FALSE)

# Saved tokenization output from Step 1.2. Note that the scoring below does
# not read these tables; it tokenizes `Mob` / `Pac` again.
Tok_Mob <- read_csv("Tokenization_Mob.csv")
Tok_Pac <- read_csv("Tokenization_Pac.csv")

# Dictionaries as plain character vectors: take the first column directly
# rather than appending 1x1 tibbles one at a time.
Positive_CN <- as.character(Positive[[1]])
Negative_CN <- as.character(Negative[[1]])

wk <- worker() # Function for tokenization

# Segment every article exactly once; the positive count, negative count and
# word count are all derived from the same token list.
Tokens_Mob <- lapply(Mob, function(txt) segment(txt, wk))
Tokens_Pac <- lapply(Pac, function(txt) segment(txt, wk))

# Number of tokens (counting repeats) that appear in `dictionary`.
count_dictionary_hits <- function(tokens, dictionary) {
  sum(tokens %in% dictionary)
}

# Calculate sentiment scores
Positive_Score_Mob <- vapply(
  Tokens_Mob, count_dictionary_hits, integer(1), dictionary = Positive_CN
)
Positive_Score_Pac <- vapply(
  Tokens_Pac, count_dictionary_hits, integer(1), dictionary = Positive_CN
)

Negative_Score_Mob <- vapply(
  Tokens_Mob, count_dictionary_hits, integer(1), dictionary = Negative_CN
)
Negative_Score_Pac <- vapply(
  Tokens_Pac, count_dictionary_hits, integer(1), dictionary = Negative_CN
)

# Total number of words
word_count_Mob <- lengths(Tokens_Mob)
word_count_Pac <- lengths(Tokens_Pac)

# Gather the per-article counts into one data frame per campaign type
NTU_Sentiment_Score_Mob <- data.frame(
  Positive_Score_Mob = Positive_Score_Mob,
  Negative_Score_Mob = Negative_Score_Mob,
  Word_Count_Mob = word_count_Mob
)

NTU_Sentiment_Score_Pac <- data.frame(
  Positive_Score_Pac = Positive_Score_Pac,
  Negative_Score_Pac = Negative_Score_Pac,
  Word_Count_Pac = word_count_Pac
)

# Normalized score: (positive hits - negative hits) / number of tokens
NTU_Sentiment_Score_Mob$Score_Mob <- with(
  NTU_Sentiment_Score_Mob,
  (Positive_Score_Mob - Negative_Score_Mob) / Word_Count_Mob
)

NTU_Sentiment_Score_Pac$Score_Pac <- with(
  NTU_Sentiment_Score_Pac,
  (Positive_Score_Pac - Negative_Score_Pac) / Word_Count_Pac
)

# Write the score tables to disk
write.csv(NTU_Sentiment_Score_Mob, file = "NTU_Sentiment_Score_Mob.csv")
write.csv(NTU_Sentiment_Score_Pac, file = "NTU_Sentiment_Score_Pac.csv")

# Data preparation for graphs
# Adds a `Campaigns` label and a `Date` column to each score table, then
# saves the enriched tables.

## Rebuild the stacked tables so rows line up with the score tables
Mob_Combined <- rbind(Mob_India_1962, Mob_Soviet_1969, Mob_Vietnam_1974, Mob_Vietnam_1979)
Pac_Combined <- rbind(Pac_Japan_1990, Pac_Japan_1996, Pac_Japan_2005,
                      Pac_Japan_2010, Pac_Japan_2012, Pac_Philippines_2016,
                      Pac_India_2017)

## Create campaign vectors: one label per row of each source table.
## Every vector gets its own `_Vec` name so the source data frames
## (notably Mob_Vietnam_1974 / Mob_Vietnam_1979) are never overwritten and
## this section can be re-run safely.
Mob_India_1962_Vec <- rep("India_1962", nrow(Mob_India_1962))
Mob_Soviet_1969_Vec <- rep("Soviet_1969", nrow(Mob_Soviet_1969))
Mob_Vietnam_1974_Vec <- rep("Vietnam_1974", nrow(Mob_Vietnam_1974))
Mob_Vietnam_1979_Vec <- rep("Vietnam_1979", nrow(Mob_Vietnam_1979))

Mob_Campaigns <- c(Mob_India_1962_Vec, Mob_Soviet_1969_Vec,
                   Mob_Vietnam_1974_Vec, Mob_Vietnam_1979_Vec)
NTU_Sentiment_Score_Mob$Campaigns <- Mob_Campaigns # Add new campaign column

# Combine time elements and convert into "yyyy-mm-dd" format
Mob_Combined$Year <- as.character(Mob_Combined$Year)

Mob_Combined$Month <- as.character(Mob_Combined$Month)
Mob_Combined$Month <- str_pad(Mob_Combined$Month, width = 2, side = "left", pad = "0") # two digits for month

Mob_Combined$Day <- as.character(Mob_Combined$Day)
Mob_Combined$Day <- str_pad(Mob_Combined$Day, width = 2, side = "left", pad = "0") # two digits for day

Mob_Date <- paste(Mob_Combined$Year, Mob_Combined$Month, Mob_Combined$Day, sep = "-") # combine and connect with "-"
Mob_Date <- as.Date(Mob_Date) # Convert into Date

NTU_Sentiment_Score_Mob$Date <- Mob_Date # Add new date column

## Same steps for the pacification campaigns
Pac_Japan_1990_Vec <- rep("Japan_1990", nrow(Pac_Japan_1990))
Pac_Japan_1996_Vec <- rep("Japan_1996", nrow(Pac_Japan_1996))
Pac_Japan_2005_Vec <- rep("Japan_2005", nrow(Pac_Japan_2005))
Pac_Japan_2010_Vec <- rep("Japan_2010", nrow(Pac_Japan_2010))
Pac_Japan_2012_Vec <- rep("Japan_2012", nrow(Pac_Japan_2012))
Pac_Philippines_2016_Vec <- rep("Philippines_2016", nrow(Pac_Philippines_2016))
Pac_India_2017_Vec <- rep("India_2017", nrow(Pac_India_2017))

Pac_Campaigns <- c(Pac_Japan_1990_Vec, Pac_Japan_1996_Vec, Pac_Japan_2005_Vec,
                   Pac_Japan_2010_Vec, Pac_Japan_2012_Vec, Pac_Philippines_2016_Vec,
                   Pac_India_2017_Vec)
NTU_Sentiment_Score_Pac$Campaigns <- Pac_Campaigns # Add new campaign column

Pac_Combined$Year <- as.character(Pac_Combined$Year)

Pac_Combined$Month <- as.character(Pac_Combined$Month)
Pac_Combined$Month <- str_pad(Pac_Combined$Month, width = 2, side = "left", pad = "0")

Pac_Combined$Day <- as.character(Pac_Combined$Day)
Pac_Combined$Day <- str_pad(Pac_Combined$Day, width = 2, side = "left", pad = "0")

Pac_Date <- paste(Pac_Combined$Year, Pac_Combined$Month, Pac_Combined$Day, sep = "-")
Pac_Date <- as.Date(Pac_Date)

NTU_Sentiment_Score_Pac$Date <- Pac_Date # Add new date column

# Save the outputs
write.csv(NTU_Sentiment_Score_Mob,file = "NTU_Sentiment_Score_Mob_Campaign_Date.csv")
write.csv(NTU_Sentiment_Score_Pac,file = "NTU_Sentiment_Score_Pac_Campaign_Date.csv")

# Entry point when starting from the saved score files.
# Each table gets a campaign-type label plus copies of its score columns
# under names shared by both tables, so the two can be stacked.
NTU_Scores_Mob <- read_csv("NTU_Sentiment_Score_Mob_Campaign_Date.csv") %>%
  mutate(
    Campaign_Type = "Mobilization Campaigns",
    Positive_Score = Positive_Score_Mob,
    Negative_Score = Negative_Score_Mob,
    Word_Count = Word_Count_Mob,
    Aggregated_Score = Score_Mob
  )

NTU_Scores_Pac <- read_csv("NTU_Sentiment_Score_Pac_Campaign_Date.csv") %>%
  mutate(
    Campaign_Type = "Pacification Campaigns",
    Positive_Score = Positive_Score_Pac,
    Negative_Score = Negative_Score_Pac,
    Word_Count = Word_Count_Pac,
    Aggregated_Score = Score_Pac
  )

# Keep only the shared columns, in a fixed order
a <- NTU_Scores_Mob %>%
  select(Campaigns, Date, Positive_Score, Negative_Score, Word_Count,
         Aggregated_Score, Campaign_Type)

b <- NTU_Scores_Pac %>%
  select(Campaigns, Date, Positive_Score, Negative_Score, Word_Count,
         Aggregated_Score, Campaign_Type)

# Stack both campaign types; the last column is renamed to "Campaign Type"
ALL <- rbind(a, b)
names(ALL) <- c(
  "Campaigns", "Date", "Positive_Score", "Negative_Score", "Word_Count",
  "Aggregated_Score", "Campaign Type"
)

write.csv(ALL, file = "NTU_Sentiment_Scores_All.csv")

# ==================== Table 7.5 =====================================================
# Mean sentiment scores by campaign type, with tests of the difference.

# Reload the stacked scores from disk (entry point when using saved outputs)
ALL <- read_csv("NTU_Sentiment_Scores_All.csv")

# Column means for each campaign type (positive, negative, aggregated)
NTU_Scores_Mob_mean <- summarise(
  NTU_Scores_Mob,
  mob_positive_average = mean(Positive_Score_Mob),
  mob_negative_average = mean(Negative_Score_Mob),
  mob_aggregated_average = mean(Aggregated_Score)
)

NTU_Scores_Pac_mean <- summarise(
  NTU_Scores_Pac,
  pac_positive_average = mean(Positive_Score_Pac),
  pac_negative_average = mean(Negative_Score_Pac),
  pac_aggregated_average = mean(Aggregated_Score)
)

## 3 x 2 comparison matrix: rows are score kinds, columns are campaign types
mean_data <- matrix(
  c(
    unlist(NTU_Scores_Mob_mean, use.names = FALSE),
    unlist(NTU_Scores_Pac_mean, use.names = FALSE)
  ),
  nrow = 3,
  dimnames = list(
    c("Positive", "Negative", "Aggregated"),
    c("Mobilization Campaign", "Pacification Campaign")
  )
)
# xtable(mean_data) # Same with stargazer
# Prints LaTeX source; paste it into a LaTeX document to render the table
stargazer(mean_data, type = 'latex', title = 'Average of sentiment scores')

# Two-sample t-tests: mobilization vs. pacification
t.test(NTU_Scores_Mob$Positive_Score_Mob, NTU_Scores_Pac$Positive_Score_Pac)
t.test(NTU_Scores_Mob$Negative_Score_Mob, NTU_Scores_Pac$Negative_Score_Pac)
t.test(NTU_Scores_Mob$Score_Mob, NTU_Scores_Pac$Score_Pac)

# One-way tests on the stacked data
# (the original author notes these give the same results as the t-tests)
oneway.test(Positive_Score ~ `Campaign Type`, data = ALL)
oneway.test(Negative_Score ~ `Campaign Type`, data = ALL)
oneway.test(Aggregated_Score ~ `Campaign Type`, data = ALL)

# ==================== Figure 7.2 =====================================================
# Three side-by-side panels (positive, negative, aggregated). Each shows the
# jittered article-level scores in grey, with the group estimate and its
# confidence interval from an intercept-only lm_robust fit drawn on top.

by_campaign_type <- group_by(ALL, `Campaign Type`)

# Group estimates; `estimate` is copied into the outcome's own column name so
# the summary rows can reuse each plot's y aesthetic.
summary_Y1 <- by_campaign_type %>%
  do(tidy(lm_robust(Positive_Score ~ 1, data = .))) %>%
  mutate(Positive_Score = estimate)

summary_Y2 <- by_campaign_type %>%
  do(tidy(lm_robust(Negative_Score ~ 1, data = .))) %>%
  mutate(Negative_Score = estimate)

summary_Y3 <- by_campaign_type %>%
  do(tidy(lm_robust(Aggregated_Score ~ 1, data = .))) %>%
  mutate(Aggregated_Score = estimate)

# Pieces shared by all three panels
campaign_colours <- c('indianred', 'seagreen')

panel_theme <- theme(
  strip.background = element_blank(),
  axis.title.x = element_blank(),
  axis.text.x = element_text(face = "bold"),
  legend.position = 'none'
)

# Fresh jittered raw-data layer for each panel
jittered_articles <- function() {
  geom_point(
    position = position_jitter(width = .25, height = .25),
    alpha = 0.2, stroke = 0, colour = 'azure4'
  )
}

positive <- ggplot(
  data = ALL,
  aes(x = `Campaign Type`, y = Positive_Score, colour = `Campaign Type`)
) +
  jittered_articles() +
  geom_point(data = summary_Y1) +
  geom_errorbar(
    data = summary_Y1,
    aes(ymin = conf.low, ymax = conf.high),
    width = 0
  ) +
  labs(y = "") +
  ylim(-3, 50) +
  panel_theme +
  scale_color_manual(values = campaign_colours)

# Negative counts are flipped in sign so that they plot downwards
negative <- ggplot(
  data = ALL,
  aes(x = `Campaign Type`, y = Negative_Score*(-1), colour = `Campaign Type`)
) +
  jittered_articles() +
  geom_point(data = summary_Y2) +
  geom_errorbar(
    data = summary_Y2,
    aes(ymin = conf.low*(-1), ymax = conf.high*(-1)),
    width = 0
  ) +
  labs(y = "") +
  ylim(-50, 3) +
  panel_theme +
  scale_color_manual(values = campaign_colours)

# Aggregated scores are multiplied by 100 for display
aggregated <- ggplot(
  data = ALL,
  aes(x = `Campaign Type`, y = Aggregated_Score*100, colour = `Campaign Type`)
) +
  jittered_articles() +
  geom_point(data = summary_Y3) +
  geom_errorbar(
    data = summary_Y3,
    aes(ymin = conf.low*100, ymax = conf.high*100),
    width = 0
  ) +
  labs(y = "") +
  ylim(-25, 25) +
  panel_theme +
  scale_color_manual(values = campaign_colours)

figure <- ggarrange(
  positive, negative, aggregated,
  labels = c('Positive Scores', 'Negative Scores', 'Aggregated Scores'),
  ncol = 3, nrow = 1,
  common.legend = TRUE, legend = 'right'
)

figure

# ==================== Figure 7.3 =====================================================
# Coefficient plots of the campaign-type means (no-intercept regressions) for
# the positive, negative and aggregated scores, stacked vertically.

# Confidence-interval multipliers passed to coefplot()
innerCI.n <- 1.64 #set so exclusion of CI implies two sided significance of 0.1
outerCI.n <- 1.96 #set so exclusion of CI implies two sided significance of 0.05

positive_coef <- coefplot(lm(Positive_Score ~ `Campaign Type` -1, data=ALL),
                 newNames = c('`Campaign Type`Pacification Campaigns' = 'Pacification Campaigns',
                              '`Campaign Type`Mobilization Campaigns'= 'Mobilization Campaigns'),
                 intercept=FALSE, lwdOuter=0.5, innerCI=innerCI.n, outerCI= outerCI.n, color = 'black',title = '') + theme_bw()
# Reset the factor levels of the plotted coefficients to control row order
positive_coef$data$Coefficient <- factor(positive_coef$data$Coefficient,
                                         levels = rev(c('Mobilization Campaigns',
                                                        'Pacification Campaigns')))
positive_coef

# Negative counts are sign-flipped, matching the presentation in Figure 7.2
negative_coef <- coefplot(lm(Negative_Score*(-1) ~ `Campaign Type` -1, data=ALL),
                          newNames = c('`Campaign Type`Pacification Campaigns' = 'Pacification Campaigns',
                                       '`Campaign Type`Mobilization Campaigns'= 'Mobilization Campaigns'),
                          intercept=FALSE, lwdOuter=0.5, innerCI=innerCI.n, outerCI= outerCI.n, color = 'black',title = '') + theme_bw()
negative_coef$data$Coefficient <- factor(negative_coef$data$Coefficient,
                                         levels = rev(c('Mobilization Campaigns',
                                                        'Pacification Campaigns')))
negative_coef

aggregated_coef <- coefplot(lm(Aggregated_Score ~ `Campaign Type` -1, data=ALL),
                          newNames = c('`Campaign Type`Pacification Campaigns' = 'Pacification Campaigns',
                                       '`Campaign Type`Mobilization Campaigns'= 'Mobilization Campaigns'),
                          intercept=FALSE, lwdOuter=0.5, innerCI=innerCI.n, outerCI= outerCI.n, color = 'black',title = '') + theme_bw()
aggregated_coef$data$Coefficient <- factor(aggregated_coef$data$Coefficient,
                                         levels = rev(c('Mobilization Campaigns',
                                                        'Pacification Campaigns')))
aggregated_coef

# Panel labels (the first label previously read 'Postive Scores')
figure_coef <- ggarrange(positive_coef, negative_coef, aggregated_coef,
                         labels = c('Positive Scores','Negative Scores','Aggregated Scores'),
                         ncol = 1, nrow = 3)
figure_coef

# ==================== Figure 7.4 =====================================================
# Horizontal box plots of the aggregated score for each campaign, drawn over
# the jittered article-level scores.

# Axis labels keyed by the campaign identifiers stored in `Campaigns`
labels_boxplot <- c(
  India_1962 = 'India 1962',
  Soviet_1969 = 'Soviet 1969',
  Vietnam_1974 = 'Vietnam 1974',
  Vietnam_1979 = 'Vietnam 1979',
  Japan_1990 = 'Japan 1990',
  Japan_1996 = 'Japan 1996',
  Japan_2005 = 'Japan 2005',
  Japan_2010 = 'Japan 2010',
  Japan_2012 = 'Japan 2012',
  Philippines_2016 = 'Philippines 2016',
  India_2017 = 'India 2017'
)

# Display order of the campaigns; the levels are reversed below so the first
# entry ends up at the top of the y axis.
campaign_order <- c(
  "India_1962", "Soviet_1969", "Vietnam_1974", "Vietnam_1979",
  "Japan_1990", "Japan_1996", "Japan_2005", "Japan_2010", "Japan_2012",
  "Philippines_2016", "India_2017"
)

# Work on a copy so `ALL$Campaigns` itself is left untouched
boxplot_data <- ALL
boxplot_data$Campaigns <- factor(
  boxplot_data$Campaigns,
  levels = rev(campaign_order),
  ordered = TRUE
)

boxplot <- ggplot(
  boxplot_data,
  aes(x = Aggregated_Score, y = Campaigns, colour = `Campaign Type`)
) +
  geom_jitter(
    shape = 15, alpha = 0.15, color = "steelblue",
    position = position_jitter(width = 0.21)
  ) +
  geom_boxplot() +
  geom_vline(aes(xintercept = 0), colour = 'black', linetype = 'dashed') +
  ylab("Media Campaigns") +
  scale_color_manual(values = c('indianred', 'seagreen')) +
  scale_y_discrete(labels = labels_boxplot)

boxplot
# Reading the plot: each box shows the median and interquartile range; with
# geom_boxplot() defaults the whiskers reach the most extreme observation
# within 1.5 x IQR of the box (not necessarily the min/max), and points beyond
# that are drawn individually. The steel-blue squares are the jittered raw scores.

# ==================== Figure 7.5 =====================================================
# Coefficient plot of per-campaign mean aggregated scores (no-intercept model).
# Uses the CI multipliers innerCI.n / outerCI.n defined for Figure 7.3.

# Campaign identifiers in display order, and their "Name Year" labels
campaign_ids <- c(
  "India_1962", "Soviet_1969", "Vietnam_1974", "Vietnam_1979",
  "Japan_1990", "Japan_1996", "Japan_2005", "Japan_2010", "Japan_2012",
  "Philippines_2016", "India_2017"
)
campaign_labels <- gsub("_", " ", campaign_ids, fixed = TRUE)

model <- lm(Aggregated_Score ~ Campaigns -1, data=ALL)

coef_graph <- coefplot(
  model,
  intercept = FALSE,
  lwdOuter = 0.5,
  innerCI = innerCI.n,
  outerCI = outerCI.n,
  color = 'black',
  # Map the lm term names ("Campaigns<id>") to readable labels
  newNames = setNames(campaign_labels, paste0("Campaigns", campaign_ids))
) +
  theme_bw() +
  ggtitle("Estimated Effect of Campaign on Aggregated NTU Sentiment Score") +
  xlab("Sentiment Score")

# Reversed levels put the first campaign at the top of the plot
coef_graph$data$Coefficient <- factor(
  coef_graph$data$Coefficient,
  levels = rev(campaign_labels)
)
coef_graph
